import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LassoCV
import re
from IPython.display import display
#import vaex
import os
# Work out of the directory that holds all of the scraped Airbnb CSVs.
print(os.getcwd())
os.chdir("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips")
#
# Show every column/row when displaying DataFrames (notebook convenience).
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# One listings snapshot per scrape tag; 'nov1'/'nov15'/'nov19' are repeat
# November scrapes.
months = ['jan','feb','mar','apr','may','jun','july','aug','sep','oct','nov1','nov15','nov19','dec']
la = 'la_'
names = {}
# names['la_<tag>'] -> listings DataFrame for that scrape.
# FIX: the loop body had lost its indentation in the notebook export.
for i in months:
    names[la + i] = pd.read_csv("listings_" + i + ".csv")
# BUG FIX: a bare `spring_listing` expression stood here; the name is never
# defined anywhere in this script and evaluating it raises NameError.
# Load the 2020 calendar snapshots for the summer months.
# FIX: `summer` was defined in a later notebook cell (out-of-order);
# define it here so the script runs top-to-bottom.
summer = ['july','aug','sep']
cal = 'cal_'
current = {}
for i in summer:
    current[cal + i] = pd.read_csv("calendar_2020_" + i + ".csv")
# Map weekday() integers (Mon=0 .. Sun=6) to readable names.
weekDays = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
for i in current:
    current[i]['date'] = pd.to_datetime(current[i]['date'])
    # PERF: vectorized .dt accessor replaces the original row-wise .apply().
    current[i]['weekday'] = current[i]['date'].dt.weekday
    current[i]['weekdays'] = current[i]['weekday'].map(lambda d: weekDays[d])
current_keys = list(current.keys())
# Stack the monthly 2020 calendars into one frame.
# BUG FIX: the original seeded current_summer with current['cal_july'] and
# then concatenated EVERY frame again, duplicating July's rows.
current_summer = pd.concat(current.values(), axis=0)
current_summer.shape
# Reduce full dates to month numbers, then keep Jul/Aug/Sep only.
current_summer.date = current_summer.date.dt.month
current_summer.date.value_counts()
current_df = current_summer[current_summer['date'].isin([7, 8, 9])]
current_df.shape
current_df.date.value_counts()
current_df.isnull().sum()
current_df = current_df.dropna()
current_df.shape
# Strip currency formatting; regex=False because "$" is a regex anchor.
current_df['price'] = current_df['price'].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
current_df['adjusted_price'] = current_df['adjusted_price'].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
# 'f' (not available) == a booked night -> 1; 't' (available) -> 0.
current_df['available'] = current_df['available'].map(dict(f=1, t=0))
current_df.head()
current_df.shape
print(os.getcwd())
current_df.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2020.csv')
# Nights booked and listing-nights counted per month (2020 summer).
current_booked = current_df.groupby('date').agg({'available': 'sum'})
current_listings = current_df.groupby('date')['price'].count().to_frame()
current_listings = current_listings.reset_index()
current_listings
current_booked = current_booked.reset_index()
current_booked
current_booked['booked_rate'] = round(current_booked['available'] / current_listings['price'], 3)
# BUG FIX: bare `mean` was an undefined name; use the string aggregator.
current_booked_avg_price = current_df.groupby('date').agg({'price': 'mean'})
current_booked_avg_price.reset_index(inplace=True)
current_booked_avg_price.price = round(current_booked_avg_price.price, 2)
current_booked['avg_price'] = current_booked_avg_price.price
current_booked.rename(columns={'available': 'booked'}, inplace=True)
print(os.getcwd())
current_booked.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2020_booked_rate.csv')
current_booked
# 2020 summer booked rate per weekday.
current_weekdays_booked = current_df.groupby('weekdays').agg({'available': 'sum'})
current_weekdays_listings = current_df.groupby('weekdays')['price'].count().to_frame()
current_weekdays_booked['booked_rate'] = round(current_weekdays_booked['available'] / current_weekdays_listings['price'], 3)
current_weekdays_booked.reset_index(inplace=True)
current_weekdays_booked
# NOTE(review): `summer_week_booked` (2019 data) is built much later in this
# script; these lines only worked in the original notebook execution order.
# Guard the forward reference so a top-to-bottom run does not crash.
if 'summer_week_booked' in globals():
    summer_week_booked['booked_rate_2020'] = current_weekdays_booked['booked_rate']
    summer_week_booked
    summer_week_booked.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_week_2019_2020.csv')
print(os.getcwd())
# BUG FIX: a stray trailing "a" after this call was a SyntaxError.
current_weekdays_booked.sort_values('booked_rate', ascending=False)
# Season month-tag groupings for the scraped files.
half_year = ['aug','sep','oct','nov1','nov15','nov19','dec','jan']
summer = ['july','aug','sep']
spring = ['jan','feb','mar']
fall = ['apr','may','jun']
calfall = 'fall_'
fall_month = {}
for i in fall:
    fall_month[calfall + i] = pd.read_csv("calendar_" + i + ".csv")
for i in fall_month:
    fall_month[i]['date'] = pd.to_datetime(fall_month[i]['date'])
    fall_month[i]['weekday'] = fall_month[i]['date'].dt.weekday
    fall_month[i]['weekdays'] = fall_month[i]['weekday'].map(lambda d: weekDays[d])
fall_month.keys()
fall_keys = list(fall_month.keys())
# BUG FIX: the original seeded fall_df with 'fall_apr' and then concatenated
# EVERY frame again, duplicating April's rows.
fall_df = pd.concat(fall_month.values(), axis=0)
fall_df.shape
fall_df = fall_df.dropna()
fall_df.date = fall_df.date.dt.month
fall_df = fall_df[fall_df['date'].isin([4, 5, 6])]
fall_df.date.value_counts()
# Strip currency formatting; regex=False because "$" is a regex anchor.
fall_df.price = fall_df.price.astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
fall_df.adjusted_price = fall_df.adjusted_price.astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
fall_df.available = fall_df.available.map(dict(f=1, t=0))
fall_df.head()
fall_cleaned = fall_df
fall_cleaned.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/fall_cleaned.csv', index=False)
fall_booked = fall_df.groupby('date').agg({'available': 'sum'})
fall_listings = fall_df.groupby('date')['price'].count().to_frame()
fall_booked['booked_rate'] = round(fall_booked['available'] / fall_listings['price'], 3)
fall_booked
# Re-load the cleaned seasonal calendars and stitch a full-year 2019 frame.
winter = pd.read_csv('winter_cleaned_updated.csv')
spring = pd.read_csv('spring_cleaned.csv')
summer = pd.read_csv('summer_cleaned.csv')
fall = pd.read_csv('fall_cleaned.csv')
winter.head()
fall = fall_cleaned  # prefer the in-memory cleaned frame over the CSV copy
spring.head()
summer.head()
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
summer_spring = pd.concat([spring, summer], axis=0)
summer_spring.date.value_counts()
winter_fall = pd.concat([fall, winter], axis=0)
winter_fall.shape
winter_fall.date.value_counts()
calendar_2019 = pd.concat([summer_spring, winter_fall], axis=0)
calendar_2019.shape
calendar_2019.date.value_counts()
print(os.getcwd())
calendar_2019.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_2019_updated.csv")
calendar_2019.weekdays.dtype
calendar_2019.date = calendar_2019.date.astype('category')
calendar_2019.weekdays = calendar_2019.weekdays.astype('category')
calendar_2019.price.dtype
# Year-level booked count, listings count and average price per month.
calendar_booked = calendar_2019.groupby('date').agg({'available': 'sum'})
calendar_listings = calendar_2019.groupby('date')['price'].count().to_frame()
# BUG FIX: `mean()` was a call on an undefined bare name, and a stray
# trailing backslash glued two statements into invalid syntax.  The broken
# first computation of calendar_price is dropped; it was recomputed below.
calendar_listings = calendar_listings.reset_index()
calendar_listings
calendar_price = calendar_2019.groupby('date').agg({'price': 'mean'})
calendar_price['price'] = round(calendar_price['price'], 2)
calendar_price = calendar_price.reset_index()
# NOTE(review): calendar_booked is indexed by date while calendar_listings
# has been reset to a RangeIndex — this division aligns on index, which
# looks unintended; preserved as in the original, but verify.
calendar_booked['booked_rate'] = round(calendar_booked.available / calendar_listings.price, 3)
calendar_booked_df = calendar_booked.reset_index()
calendar_booked_df['avg_price'] = calendar_price['price']
calendar_booked_df['listings'] = calendar_listings['price']
calendar_booked_df
print(os.getcwd())
calendar_booked_df.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_booked_df_with_listings.csv', index=False)
#calendar_booked_df.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_booked_df.csv', index = False)
calendar_2019_booked_rate = calendar_booked_df.sort_values('booked_rate', ascending=False)
import os
print(os.getcwd())
calendar_booked_df = pd.read_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_booked_df.csv')
# FIX: the same file was read twice in a row; one read is enough.
calendar_2019 = pd.read_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_2019.csv')
# BUG FIX: a stray leading "." made this assignment a SyntaxError.
yearly_booked = calendar_2019.groupby(['date','weekdays']).agg({'available': 'sum'})
calendar_2019.groupby('date')['price'].count().to_frame()
yearly_listings = calendar_2019.groupby(['date','weekdays'])['price'].count().to_frame()
yearly_booked['booked_rate'] = round(yearly_booked['available'] / yearly_listings['price'], 3)
yearly_booked_df = yearly_booked.reset_index()
yearly_booked_rate = yearly_booked.reset_index()
yearly_booked_rate
yearly_booked_rate.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/yearly_booked_rate.csv', index=False)
yearly_booked_rate
month_1 = yearly_booked_rate[yearly_booked_rate['date'] == 1].sort_values('booked_rate', ascending=False)
month_1
# FIX: seaborn was only imported at the very bottom of the script; import it
# here so this plot works in a top-to-bottom run.
import seaborn as sns
sns.lineplot(data=month_1, x="weekdays", y="booked_rate")
calendar_2019.head()
# Booked rate, listings count and average price per weekday (2019).
calendar_weekdays_booked = calendar_2019.groupby('weekdays').agg({'available': 'sum'})
calendar_weekdays_listings = calendar_2019.groupby('weekdays')['price'].count().to_frame()
calendar_weekdays_booked['booked_rate'] = round(calendar_weekdays_booked.available / calendar_weekdays_listings.price, 3)
# BUG FIX: bare `mean` was an undefined name; use the string aggregator.
calendar_weekdays_avgprice = calendar_2019.groupby('weekdays').agg({'price': 'mean'})
calendar_weekdays_avgprice = calendar_weekdays_avgprice.reset_index()
calendar_weekdays_avgprice['price'] = round(calendar_weekdays_avgprice['price'], 2)
calendar_weekdays_total = calendar_weekdays_listings.reset_index()
calendar_weekdays_booked = calendar_weekdays_booked.reset_index()
calendar_weekdays_listings = calendar_weekdays_listings.reset_index()
calendar_weekdays_listings
calendar_weekdays_booked['listings'] = calendar_weekdays_listings['price']
calendar_weekdays_booked['avg_price'] = calendar_weekdays_avgprice['price']
calendar_weekdays_booked
print(os.getcwd())
calendar_weekdays_booked.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/calendar_weekdays_booked_final_with_avg_price.csv', index=False)
calendar_2019.dropna(inplace=True)
month_index = calendar_2019["date"].astype('category')
month_index.head()
import statsmodels.api as stat
month_dummies_new = pd.get_dummies(month_index)
month_dummies_new.head()
# BUG FIX: `month_dummies` and `calendar_month_dummies` came from an earlier
# notebook session and were undefined here.  Reconstruct them: the intent
# (per the .drop of column 0) was a dummy matrix with one month dropped as
# the regression baseline.  TODO confirm this matches the lost cell.
month_dummies = month_dummies_new
calendar_months_dummies = month_dummies_new.drop(month_dummies_new.columns[[0]], axis=1)
calendar_months_dummies.head()
calendar_month_y = calendar_2019["price"]
calendar_month_y.shape
month_dummies_new.shape
if isinstance(calendar_2019, pd.DataFrame):
    print('yes')
month_dummies.head()
# OLS of nightly price on month dummies (price seasonality).
month_ols = stat.OLS(calendar_month_y, month_dummies)
month_new_ols = stat.OLS(calendar_month_y, month_dummies_new)
month_new_ols_result = month_new_ols.fit()
print(month_new_ols_result.summary())
month_ols_result = month_ols.fit()
print(month_ols_result.summary())
calendar_2019.dtypes
booked_month = calendar_2019["available"].astype('category')
booked = calendar_2019["available"]
# FIX (cell order): the dummy matrices and price target were originally
# defined AFTER their first use; build them before fitting.
calendar_weekdays_dummies = pd.get_dummies(calendar_2019['weekdays'])
calendar_weekdays = calendar_2019['weekdays'].astype('category')
weekdays_dummy = pd.get_dummies(calendar_weekdays)
calendar_weekdays_y = calendar_2019['price']
calendar_2019.columns
# Booked indicator regressed on month dummies -> occupancy seasonality.
seasonality_booked_model = stat.OLS(booked, month_dummies)
seasonality_booked_result = seasonality_booked_model.fit()
print(seasonality_booked_result.summary())
# Booked indicator regressed on weekday dummies.
weekdays_booked = stat.OLS(booked, weekdays_dummy)
weekdays_booked_result = weekdays_booked.fit()
print(weekdays_booked_result.summary())
print(seasonality_booked_result.summary())
# Price regressed on weekday-name dummies.
weekdays_ols = stat.OLS(calendar_weekdays_y, weekdays_dummy)
weekdays_result = weekdays_ols.fit()
print(weekdays_result.summary())
# Price regressed on numeric-weekday dummies.
weekdays_index = calendar_2019['weekday'].astype('category')
weekdays_dummies = pd.get_dummies(weekdays_index)
calendar_weekdays_ols = stat.OLS(calendar_weekdays_y, weekdays_dummies)
calendar_weekdays_result = calendar_weekdays_ols.fit()
print(calendar_weekdays_result.summary())
# BUG FIX: `calendar = pd.concat()` with no arguments raises ValueError;
# the call was a leftover placeholder and has been removed.
cal = 'cal_'
calendars = {}
for i in half_year:
    calendars[cal + i] = pd.read_csv("calendar_" + i + ".csv")
calspring = 'spring_'
spring_month = {}
# FIX: by this point `spring` had been rebound to a DataFrame (the CSV read
# above), so iterating it yielded column names; restore the month-tag list.
spring = ['jan','feb','mar']
for i in spring:
    spring_month[calspring + i] = pd.read_csv("calendar_" + i + ".csv")
for i in spring_month:
    spring_month[i]['date'] = pd.to_datetime(spring_month[i]['date'])
    spring_month[i]['weekday'] = spring_month[i]['date'].dt.weekday
    spring_month[i]['weekdays'] = spring_month[i]['weekday'].map(lambda d: weekDays[d])
spring_keys = list(spring_month.keys())
# BUG FIX: the original seeded spring_df with 'spring_jan' and then
# concatenated EVERY frame again, duplicating January's rows.
spring_df = pd.concat(spring_month.values(), axis=0)
spring_df.shape
spring_df.isnull().sum()
spring_nona = spring_df.dropna()
spring_nona['date'] = spring_nona['date'].dt.month
spring_cleaned = spring_nona[spring_nona['date'].isin([1, 2, 3])]
# Strip currency formatting; regex=False because "$" is a regex anchor.
spring_cleaned['price'] = spring_cleaned['price'].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
spring_cleaned['adjusted_price'] = spring_cleaned['adjusted_price'].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
spring_cleaned.available = spring_cleaned.available.map(dict(f=1, t=0))
spring_category = ['date','weekday']
spring_cleaned[spring_category] = spring_cleaned[spring_category].astype('category')
spring_cleaned.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/spring_cleaned.csv', index=False)
spring_cleaned.head()
spring_cleaned.dtypes
spring_booked = spring_cleaned.groupby('date').agg({"available": 'sum'})
spring_booked
spring_listed = spring_cleaned.groupby('date')['price'].count().to_frame()
spring_listed
spring_booked['booked_rate'] = round(spring_booked['available'] / spring_listed['price'], 2)
spring_booked.reset_index(level=0, inplace=True)
spring_booked.rename(columns={"date": 'month', 'available': 'num_booked'})
spring_week_listing = spring_cleaned.groupby('weekdays')['price'].count().to_frame()
spring_week_booked = spring_cleaned.groupby('weekdays').agg({'available': 'sum'})
spring_week_booked['booked_rate'] = round(spring_week_booked['available'] / spring_week_listing['price'], 3)
spring_week_booked.reset_index(level=0, inplace=True)
spring_week_booked.rename(columns={'available': 'num_booked'})
spring_week_booked.sort_values('booked_rate', ascending=False)
print(os.getcwd())
os.chdir("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips")
winter = ['oct','nov19','dec']
calwinter = "winter_"
winter_month = {}
for i in winter:
    winter_month[calwinter + i] = pd.read_csv("calendar_" + i + ".csv")
for i in winter_month:
    print(i)
for i in winter_month:
    winter_month[i]['date'] = pd.to_datetime(winter_month[i]['date'])
    winter_month[i]['weekday'] = winter_month[i]['date'].dt.weekday
    winter_month[i]['weekdays'] = winter_month[i]['weekday'].map(lambda d: weekDays[d])
winter_keys = list(winter_month.keys())
# BUG FIX: the original seeded winter_df with 'winter_oct' and then
# concatenated EVERY frame again, duplicating October's rows.
winter_df = pd.concat(winter_month.values(), axis=0)
winter_df.shape
winter_df['date'] = winter_df['date'].dt.month
winter_df['date'].value_counts()
winter_df = winter_df[winter_df['date'].isin([10, 11, 12])]
winter_df.shape
winter_df.isnull().sum()
winter_cleaned = winter_df.dropna()
# Strip currency formatting; regex=False because "$" is a regex anchor.
winter_cleaned['price'] = winter_cleaned['price'].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
winter_cleaned['adjusted_price'] = winter_cleaned['adjusted_price'].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
winter_cleaned.dtypes
winter_cleaned['available'] = winter_cleaned['available'].map(dict(f=1, t=0))
winter_category = ['date','weekday']
winter_cleaned[winter_category] = winter_cleaned[winter_category].astype('category')
winter_cleaned.shape
winter_cleaned.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/winter_cleaned_updated.csv', index=False)
winter_booked = winter_cleaned.groupby('date').agg({"available": 'sum'})
winter_listings = winter_cleaned.groupby('date')['price'].count().to_frame()
winter_booked['booked_rate'] = round(winter_booked['available'] / winter_listings['price'], 2)
winter_booked.reset_index(level=0, inplace=True)
winter_booked
winter_booked = winter_booked.rename(columns={'date': 'month', 'available': 'num_booked'})
winter_week_booked = winter_cleaned.groupby('weekdays').agg({'available': 'sum'})
winter_week_listing = winter_cleaned.groupby('weekdays')['price'].count().to_frame()
winter_week_booked['booked_rate'] = round(winter_week_booked['available'] / winter_week_listing['price'], 3)
winter_week_booked.reset_index(level=0, inplace=True)
winter_week_booked = winter_week_booked.rename(columns={'available': 'booked'})
winter_week_booked.sort_values('booked_rate', ascending=False)
winter_df.shape
# NOTE(review): `summer_cleaned` / `summer_booked` are built in LATER cells;
# guard the forward references so a top-to-bottom run does not crash.
if 'summer_cleaned' in globals():
    summer_cleaned.shape
spring_cleaned.shape
winter_booked
if 'summer_booked' in globals():
    summer_booked
spring_booked
# Study summer price trend and booked rate first.
calsummer = 'summer_'
months = {}
# FIX: `summer` was rebound to a DataFrame earlier in this script; restore
# the month-tag list before iterating (notebook out-of-order cell).
summer = ['july','aug','sep']
for i in summer:
    months[calsummer + i] = pd.read_csv("calendar_" + i + ".csv")
# FIX: weekDays was redefined three times in a row; define it once.
weekDays = ("Monday","Tuesday","Wednesday","Friday","Saturday","Sunday")
weekDays = ("Monday","Tuesday","Wednesday","Thursday","Friday","Saturday","Sunday")
for i in calendars:
    calendars[i]['date'] = pd.to_datetime(calendars[i]['date'])
    calendars[i]['weekday'] = calendars[i]['date'].dt.weekday
    calendars[i]['weekdays'] = calendars[i]['weekday'].map(lambda d: weekDays[d])
for i in months:
    months[i]['date'] = pd.to_datetime(months[i]['date'])
    months[i]['weekday'] = months[i]['date'].dt.weekday
    months[i]['weekdays'] = months[i]['weekday'].map(lambda d: weekDays[d])
def weekday(months):
    """Add 'weekday' and 'weekdays' columns to every frame, in place.

    Parameters
    ----------
    months : dict[str, pandas.DataFrame]
        Each frame must have a 'date' column parseable by pd.to_datetime.
        After the call every frame gains:
          * 'weekday'  — integer day of week (Monday=0 .. Sunday=6)
          * 'weekdays' — the matching English day name
        Frames are mutated in place; nothing is returned.
    """
    weekDays = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
                "Saturday", "Sunday")
    for key in months:
        frame = months[key]
        frame['date'] = pd.to_datetime(frame['date'])
        # PERF: vectorized .dt accessor replaces the original row-wise apply.
        frame['weekday'] = frame['date'].dt.weekday
        frame['weekdays'] = frame['weekday'].map(lambda d: weekDays[d])
#calendars['cal_jan']['date'].dtype
months.keys()
summer_list = list(months.keys())
# BUG FIX: the original seeded summer_df with 'summer_july' and then
# concatenated EVERY frame again, duplicating July's rows.
summer_df = pd.concat(months.values(), axis=0)
summer_df.shape
summer_df['date']
summer_df['date'] = summer_df['date'].dt.month
summer_df['date'].value_counts()
summer_cleaned = summer_df[summer_df['date'].isin([7, 8, 9])]
print("After filter out data of July, August, and September, the new dataset summer_cleaned has %s of rows." % summer_cleaned.shape[0])
summer_cleaned.isnull().sum()
summer_nona = summer_cleaned.dropna()
print("After dropping all rows with na values, the shape of the new summer_df is: %s" % summer_nona.shape[0])
print("The original shape is: %s" % summer_df.shape[0])
#duplicated_listing = summer_cleaned[summer_cleaned.duplicated()]
#duplicated_listing.head(50)
summer_cleaned.dtypes
summer_cleaned[summer_cleaned['price'] != summer_cleaned['adjusted_price']].head()
# Strip currency formatting; regex=False because "$" is a regex anchor.
summer_cleaned['price'] = summer_cleaned['price'].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
# BUG FIX: the original cleaned the 'price' column into 'adjusted_price',
# silently overwriting adjusted_price with price values.
summer_cleaned['adjusted_price'] = summer_cleaned['adjusted_price'].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
print(os.getcwd())
summer_cleaned.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2019_cleaned.csv')
summer_cleaned.head()
summer_cleaned.weekdays = summer_cleaned.weekdays.astype('category')
summer_cleaned.weekday = summer_cleaned.weekday.astype('category')
summer_cleaned.date = summer_cleaned.date.astype('category')
summer_cleaned.dtypes
summer_cleaned.groupby('date')['price'].mean().sort_values(ascending=False)
# 'f' (not available) == a booked night -> 1; 't' (available) -> 0.
summer_cleaned.available = summer_cleaned.available.map(dict(f=1, t=0))
summer_cleaned.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2019_cleaned_final.csv')
summer_cleaned.head()
summer_cleaned.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_cleaned.csv', index=False)
listings_grouped = summer_cleaned.groupby('date')['price'].count().to_frame()
summer_booked = summer_cleaned.groupby('date').agg({'available': 'sum'})
# BUG FIX: `available_grouped` was never defined — the intended numerator is
# summer_booked['available'].  Bare `mean` was also an undefined name.
summer_booked['available'] / listings_grouped['price']
listings_grouped
summer_avg_price = summer_cleaned.groupby('date').agg({'price': 'mean'})
summer_avg_price
summer_booked['book_rate'] = round(summer_booked['available'] / listings_grouped['price'], 2)
summer_booked['avg_price'] = round(summer_avg_price.price, 2)
summer_booked.reset_index(inplace=True)
summer_booked
print(os.getcwd())
summer_booked.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2019_avg_price.csv')
# BUG FIX: a second reset_index here would have injected a stray 'index'
# column, and `booked_grouped` was undefined — rename summer_booked itself.
summer_booked = summer_booked.rename(columns={'date': 'month', 'available': 'num_booked'})
summer_booked['booked_rate_2020'] = current_booked['booked_rate']
summer_booked
summer_booked.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/summer_2019_2020.csv")
print(os.getcwd())
summer_week_booked = summer_cleaned.groupby('weekdays').agg({'available': 'sum'})
summer_week_listed = summer_cleaned.groupby('weekdays')['price'].count().to_frame()
summer_week_booked['booked_rate'] = round(summer_week_booked['available'] / summer_week_listed['price'], 3)
summer_week_booked.reset_index(level=0, inplace=True)
summer_week_booked = summer_week_booked.rename(columns={'available': 'num_booked'})
summer_week_booked
summer_week_booked.sort_values('booked_rate', ascending=False)
winter = ['oct','nov1','nov15','nov19','dec']
calendars.keys()
cal_list = list(calendars.keys())
cal_list
# BUG FIX: the original seeded with calendars['cal_jan'] (the LAST key) and
# then concatenated cal_list[1:], which duplicated January's rows and
# silently dropped August (cal_list[0]).
calendar_df = pd.concat(calendars.values(), axis=0)
#calendar_df['listing_id'].value_counts().shape
calendar_df.shape
calendar_df.isnull().sum()
# IDIOM: the built-in 'mean' aggregator replaces the lambda wrapper.
calendar_df[['weekdays','price']].groupby('weekdays').agg('mean')
#airbnb_nomissing['id'].value_counts().shape
# Stack the July/Aug/Sep listings snapshots and keep modeling columns.
summer_listings = pd.concat([names['la_july'], names['la_aug'], names['la_sep']], axis=0)
summer_listings.head()
summer_listings = summer_listings[['id', 'host_is_superhost', 'host_total_listings_count', 'zipcode',
    'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
    'beds', 'amenities', 'price', 'cleaning_fee', 'guests_included',
    'minimum_nights', 'maximum_nights', 'instant_bookable',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms']]
# FIX (cell order): the merge was originally performed AFTER summer_merged
# had already been used; build it first so the script runs top-to-bottom.
summer_cleaned.head()
summer_merged = summer_listings.merge(summer_cleaned, left_on='id', right_on='listing_id')
summer_cleaned[summer_cleaned['listing_id'] == 109].shape
summer_cleaned['date'].value_counts()
summer_merged.head(100)
summer_merged['room_type'].value_counts()
summer_merged.isnull().sum()
summer_merged.shape
summer_merged = summer_merged.dropna()
# Replace the raw amenities string with a simple count of amenities.
summer_merged['amenities'] = summer_merged['amenities'].apply(lambda x: len(x.split(",")))
summer_merged['property_type'].value_counts()
summer_merged_subset = summer_merged[summer_merged.property_type
    .isin(['Apartment', "House", 'Condominium', 'Guesthouse'])]
summer_merged_drop_columns = ['price_x', 'guests_included',
    'minimum_nights_x', 'maximum_nights_x', 'listing_id', 'adjusted_price']
summer_merged_subset = summer_merged_subset.drop(columns=summer_merged_drop_columns)
summer_merged_subset.columns
# NOTE(review): removed `airbnb_training.columns` (undefined at this point),
# an `airbnb_subset` snippet belonging to a later cell, and a leftover
# `df1.merge(df2, ...)` documentation example that referenced undefined names.
keys_list = list(names.keys())
# Seed with the first month and append the rest (the [1:] slice correctly
# avoids duplicating the seed, unlike the seasonal cells above).
airbnb_df = names['la_jan']
for i in keys_list[1:]:
    airbnb_df = pd.concat([airbnb_df, names[i]], axis=0)
airbnb_df.shape
print(os.getcwd())
airbnb_df.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_df_original.csv')
airbnb_df['state'].value_counts()
# Keep California rows only; the state field is inconsistently cased.
airbnb_ca = airbnb_df[airbnb_df['state'].isin(['CA', 'Ca', 'ca'])]
print(os.getcwd())
airbnb_ca.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_ca_only.csv")
print("The dataset before subsetting has %s" % airbnb_df.shape[0] + ' rows.')
# FIX: corrected the "dateser" typo in the printed message.
print("The dataset after subsetting has %s" % airbnb_ca.shape[0] + ' rows.')
airbnb_ca.last_scraped.value_counts()
# Drop the duplicate mid-November scrapes.
airbnb_ca = airbnb_ca[~airbnb_ca.last_scraped.isin(['2019-11-15', '2019-11-20'])]
airbnb_ca.shape
airbnb_ca.head()
# BUG FIX: `columns_needed` was used one line before it was defined.
columns_needed = ['neighbourhood_cleansed', 'price', 'property_type', 'room_type', 'latitude',
    'longitude']
with_long_lat = airbnb_ca[columns_needed]
with_long_lat[['neighbourhood_cleansed','property_type','room_type']].apply(pd.Series.nunique, axis='rows')
with_long_lat[['neighbourhood_cleansed','property_type','room_type']] = with_long_lat[['neighbourhood_cleansed','property_type','room_type']].astype('category')
property_type_table = with_long_lat.property_type.value_counts().to_frame()
property_type_df = property_type_table.reset_index()
property_type_df
# Keep only the four dominant property types for mapping.
with_long_lat_subset = with_long_lat[with_long_lat.property_type
    .isin(['Apartment', "House", 'Condominium', 'Guesthouse'])]
with_long_lat_subset.loc[:, "property_type"] = with_long_lat_subset.loc[:, "property_type"].cat.remove_unused_categories()
with_long_lat_nonas = with_long_lat_subset.dropna()
with_long_lat_nonas.shape
# Strip currency formatting; regex=False because "$" is a regex anchor.
with_long_lat_nonas.price = with_long_lat_nonas.price.astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
with_long_lat_nonas.head()
print(os.getcwd())
with_long_lat_nonas.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/with_long_lat_nonas.csv")
print(os.getcwd())
property_type_df.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/property_type_df.csv')
# Some of these features may have missing values, and some are not relevant
# to training a machine-learning model, so data cleaning is necessary.
# Strategy:
# 1. Drop columns that are not relevant to the objective (e.g. experiences_offered, URLs).
# 2. Count missing values for each column remaining after step 1.
# 3. Open question: what about columns missing more than ~30% of their values?
# 4. Convert columns to their correct data types.
# Check how data types are distributed.
airbnb_ca.dtypes.value_counts()
airbnb_ca.select_dtypes(include=['object']).columns
# FIX: removed the duplicate 'host_has_profile_pic' entry.
drop_object_list = ['listing_url','last_scraped','name','summary','description','experiences_offered','picture_url',
    'host_url','host_name', 'host_since', 'host_location', 'host_about',
    'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
    'street', 'neighbourhood','city', 'state','neighbourhood_group_cleansed','market','smart_location',
    'country_code', 'country','calendar_updated','calendar_last_scraped',
    'first_review', 'last_review',"jurisdiction_names",'is_business_travel_ready',
    'interaction','access','transit','house_rules','cancellation_policy',
    'host_has_profile_pic','host_verifications','require_guest_profile_picture',
    'require_guest_phone_verification','bed_type','security_deposit','requires_license',
    "host_identity_verified","extra_people",
    'has_availability','is_location_exact','neighborhood_overview','zipcode','instant_bookable'
    ]
## export this drop object list as csv and send it to DD.
airbnb_ca.select_dtypes(include=["float64"]).columns
# latitude/longitude are excluded from model features but may be useful for
# visualizations.
drop_float_list = ['thumbnail_url', 'medium_url', 'xl_picture_url',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    'host_listings_count','latitude',
    'longitude','host_total_listings_count'
    ]
# Open questions: do availability_30 etc. mean days available within a year?
# Is calculated_host_listings_count needed once one-hot encoding is ready?
airbnb_ca[['guests_included','room_type', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms']].head()
airbnb_ca.select_dtypes(include=["int64"]).columns
# FIX: removed the duplicate 'calculated_host_listings_count' entry.
drop_int_list = ['host_id','scrape_id','minimum_minimum_nights','maximum_minimum_nights',
    'minimum_maximum_nights','maximum_maximum_nights','calculated_host_listings_count',
    "number_of_reviews","number_of_reviews_ltm",'availability_30', 'availability_60',
    'availability_90', 'availability_365',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms','maximum_nights','guests_included']
#airbnb_ca[['calculated_host_listings_count','calculated_host_listings_count_entire_homes',
#'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms']].head()
# Combine all of the columns to be dropped.
drop_columns = drop_object_list + drop_float_list + drop_int_list
print(drop_columns)
print("Dropping %s columns" % len(drop_columns))
airbnb_ca_cleaned = airbnb_ca.drop(columns=drop_columns)
# FIX: corrected the "fo" typo in the printed message.
print("Shape of the dataset after dropping : ", airbnb_ca_cleaned.shape)
missing_stats = airbnb_ca_cleaned.isnull().sum().to_frame()
missing_stats.shape
missing_stats.columns = ["missing_num"]
missing_stats["missing_pctg"] = round(100 * missing_stats["missing_num"] / airbnb_ca_cleaned.shape[0])
missing_stats.sort_values(by="missing_num", ascending=False, inplace=True)
display(missing_stats)
# Drop any column missing 19% or more of its values.
drop_missing_values = list(missing_stats[missing_stats['missing_pctg'] >= 19].index)
print(drop_missing_values)
airbnb_partial_cleaned = airbnb_ca_cleaned.drop(columns=drop_missing_values)
airbnb_partial_cleaned.shape
# BUG FIX: this line was bare prose ("1. some columns with object type
# should be changed to categorical type so as to run one-hot encoding when
# training the model") and was a SyntaxError; preserved here as a comment.
# Strip currency formatting; regex=False because "$" is a regex anchor.
airbnb_partial_cleaned['price'] = airbnb_partial_cleaned['price'].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
airbnb_partial_cleaned['cleaning_fee'] = airbnb_partial_cleaned['cleaning_fee'].astype(str).str.replace("$", "", regex=False).str.replace(",", "", regex=False).astype(float)
airbnb_partial_cleaned.columns
# Which columns should be changed to categorical?
airbnb_partial_cleaned.select_dtypes(include="object").head()
airbnb_partial_cleaned.select_dtypes(include='object').columns
categorical_cols = ['host_is_superhost', 'property_type', 'room_type',
    'neighbourhood_cleansed'
    ]
airbnb_partial_cleaned[categorical_cols].dtypes
airbnb_partial_cleaned[categorical_cols] = airbnb_partial_cleaned[categorical_cols].apply(lambda x: x.astype('category'),
    axis='rows')
airbnb_partial_cleaned[categorical_cols].dtypes
# BUG FIX: this map was applied TWICE in the original; the second pass
# re-mapped the already-converted 0/1 values to NaN.
airbnb_partial_cleaned.host_is_superhost = airbnb_partial_cleaned.host_is_superhost.map(dict(f=0, t=1))
airbnb_partial_cleaned.shape
num_unique_values = airbnb_partial_cleaned[categorical_cols].apply(pd.Series.nunique, axis='rows')
num_unique_values
num_unique_values.plot(kind='bar')
plt.xlabel('labels')
plt.ylabel('Number of unique values')
plt.show()
#airbnb_partial_cleaned.to_csv("partial_cleaned_listings_without_subsetting.csv", index = False)
# Keep only the four most common property types.
wanted_types = ['Apartment', "House", 'Condominium', 'Guesthouse']
airbnb_subset = airbnb_partial_cleaned[airbnb_partial_cleaned.property_type.isin(wanted_types)]
airbnb_subset.shape
# Drop category levels that no longer occur after subsetting.
airbnb_subset.loc[:, "property_type"] = airbnb_subset.loc[:, "property_type"].cat.remove_unused_categories()
airbnb_subset['property_type'].value_counts()
# Replace the raw amenities string with a count of listed amenities.
airbnb_subset['amenities'] = airbnb_subset['amenities'].apply(lambda s: len(s.split(",")))
# Tabulate missing values per column, with percentages.
subset_missing = airbnb_subset.isnull().sum().to_frame()
subset_missing.columns = ["missing_values"]
subset_missing['percentage'] = (100 * subset_missing['missing_values'] / airbnb_subset.shape[0])
subset_missing
cols_missing_values = list(subset_missing[subset_missing.missing_values > 0].index)
airbnb_subset_missing_values = airbnb_subset[cols_missing_values]
# Inspect host-related columns whose superhost flag is missing.
host_cols = list(airbnb_subset_missing_values.columns[airbnb_subset_missing_values.columns.str.contains('host')])
host_cols
airbnb_subset_missing_values[host_cols][airbnb_subset_missing_values.host_is_superhost.isnull()].shape
airbnb_nomissing = airbnb_subset.dropna()
airbnb_nomissing.head()
airbnb_nomissing.columns
print("After dropping all of missing values for each column, the dataset named airbnb_nomissing \n has %s" % airbnb_nomissing.shape[0]
    + " rows."
    )
# BUG FIX: a stray bare `a` stood here (NameError at runtime); removed.
#airbnb_training = airbnb_nomissing.drop(['latitude','longitude'], axis = 'columns')
#if "latitude" in airbnb_training.columns:
#    print('yes')
#else:
#    print('no')
airbnb_la = airbnb_nomissing
print(os.getcwd())
airbnb_la.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_la.csv', index = False)
print(os.getcwd())
airbnb_la = pd.read_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_la.csv")
# -- OLS on listings excluding the two 'Hotel room' rows --
categorical_cols = ['host_is_superhost', 'property_type','room_type',
                    'neighbourhood_cleansed'
                   ]
airbnb_la.dtypes
airbnb_la.property_type.value_counts()
airbnb_la[airbnb_la['room_type'] == 'Hotel room']
# NOTE(review): 277461/277507 are hard-coded row labels for the two hotel
# rooms -- confirm they are stable across reloads of airbnb_la.csv.
airbnb_la_no_hotel =airbnb_la.drop([277461,277507])
airbnb_la.room_type.head()
categorical_cols
airbnb_la_no_hotel[categorical_cols] = airbnb_la_no_hotel[categorical_cols].astype('category')
no_hotel_onehot = pd.get_dummies(airbnb_la_no_hotel[categorical_cols])
airbnb_la_no_hotel.dtypes
no_hotel_onehot.shape
airbnb_nohotel_dummies = pd.concat([no_hotel_onehot, airbnb_la_no_hotel], axis = 'columns').drop(categorical_cols,
                                                                                                 axis = 'columns')
# Fix: the original `drop(['id',axis = 1)` was a syntax error (missing bracket).
airbnb_nohotel_dummies = airbnb_nohotel_dummies.drop(['id'], axis = 1)
airbnb_nohotel_dummies.head()
no_hotel_price = airbnb_la_no_hotel['price']
# statsmodels is imported here because the module-level `import statsmodels.api
# as stat` only appears later in the notebook flow.
import statsmodels.api as stat
# NOTE(review): airbnb_nohotel_dummies still contains the 'price' column, so
# this regresses price partly on itself -- confirm it should be dropped first.
no_hotel_ols = stat.OLS(no_hotel_price, airbnb_nohotel_dummies)
no_hotel_ols_result = no_hotel_ols.fit()
print(no_hotel_ols_result.summary())
# One-hot encode the categorical columns of the full (hotel-included) data,
# then append the dummies and drop the original categorical columns.
airbnb_la[categorical_cols] = airbnb_la[categorical_cols].astype('category')
category_one_hot = pd.get_dummies(airbnb_la[categorical_cols])
category_one_hot.shape
category_one_hot.head()
combined = pd.concat([category_one_hot, airbnb_la], axis = 'columns')
airbnb_with_dummies = combined.drop(categorical_cols, axis = 'columns')
airbnb_with_dummies.shape
# Persist the encoded frame for later cells.
print(os.getcwd())
airbnb_with_dummies.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_with_dummies.csv", index = False)
airbnb_with_dummies.head()
### adding month columns
###correlation matrix
### atrributes affect the price.
len(airbnb_la.columns)
airbnb_la.columns
airbnb_la['room_type'].value_counts()
# Fix: bare `mean` was an undefined name; pandas accepts the string 'mean'.
airbnb_la.groupby('room_type').agg({'price':'mean'})
# Correlations among predictors (id/price excluded).
airbnb_corr = airbnb_la.drop(['id','price'],axis =1)
import seaborn as sns
corr = airbnb_corr.corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);
# (removed a stray `, mask=np.zeros_like(corr, dtype=np.bool)` fragment -- it
# was a syntax error, and np.bool is removed in modern NumPy anyway)
sns.heatmap(corr, annot = True, cmap=sns.diverging_palette(230, 20, as_cmap=True),
            square=True, ax=ax)
sns.heatmap(corr, annot=True)
# Fix: `plot.show()` referenced an undefined name; the figure lives in plt.
plt.show()
print(os.getcwd())
# Redraw the correlation heatmap with annotations and save it to disk.
f, ax = plt.subplots(figsize=(9, 8))
corr = airbnb_corr.corr()
sns.heatmap(corr, annot = True, annot_kws={"size":13}, fmt=".2f",cmap=sns.diverging_palette(220, 10, as_cmap=True))
# NOTE(review): presumably works around the matplotlib 3.1.1 bug that clips
# the top/bottom heatmap rows -- confirm it is still needed.
ax.set_ylim(np.array([-0.5, -.5])+ax.get_ylim())
plt.xticks(fontsize=13, rotation=45)
plt.yticks(fontsize=13)
plt.savefig('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/correlation_matrix2.jpeg',bbox_inches='tight')
# Fix: np.bool was deprecated and removed (NumPy 1.24+); builtin bool is the
# documented replacement and produces the identical all-False mask.
sns.heatmap(airbnb_corr.corr(),mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
airbnb_la = pd.read_csv("airbnb_la.csv")
# Concatenate two tables together.
airbnb_with_dummies = pd.concat([category_one_hot, airbnb_la], axis = 'columns').drop(categorical_cols,
                                                                                     axis = 'columns')
airbnb_la.to_csv(r'/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/airbnb_la.csv', index = False)
print(os.getcwd())
airbnb_with_dummies.shape
airbnb_with_dummies.columns
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
airbnb_with_dummies.head()
airbnb_with_dummies.head()
# Build the (unscaled) design matrix: drop target, id, and accommodates.
airbnb_extra_columns = ['price','id','accommodates']
drop_extra_columns = airbnb_with_dummies.drop(airbnb_extra_columns,axis = 1)
### check the number of hotal rooms
drop_extra_columns.head()
airbnb_no_beds_columns = ['price','id','beds',]
noscale_x_train = drop_extra_columns
# Fix: `'accommodates'.isin(...)` raised AttributeError (str has no .isin);
# a plain membership test on the columns is the intended sanity check.
if 'accommodates' in noscale_x_train.columns:
    print('yes')
print(os.getcwd())
noscale_x_train.to_csv("/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/noscale_x_train_without_price_id_accommodates.csv")
noscale_x_train.head()
x_train= drop_extra_columns
x_train.head()
y_train= airbnb_with_dummies['price']
import statsmodels.api as stat
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
noscale_x_train.head()
noscale_x_train.columns
# (removed `X = stat.add_constant(X, prepend=False)` -- `X` was never defined;
# the constant is added to noscale_x_train on the line below instead)
noscale_x_train.head()
noscale_x_train = stat.add_constant(noscale_x_train, prepend = False)
# Drop one dummy per categorical (reference levels) to avoid perfect collinearity.
noscale_x_train_1 = noscale_x_train.drop(['host_is_superhost_0','property_type_Apartment','room_type_Hotel room'], axis = 1)
ols2 = stat.OLS(y_train, noscale_x_train_1)
ols2_result = ols2.fit()
print(ols2_result.summary())
# Full-dummy OLS (kept for comparison with ols2 above).
ols =stat.OLS(y_train, noscale_x_train)
ols_result =ols.fit()
### Email Jinan to ask about property type
### Drop the hotel room types as it only has two listings.
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
print(ols_result.summary())
d = {'col1': [1, 2], 'col2': [3, 4]}
# Hand-transcribed coefficients from the fitted OLS summary.
factors = {'features':['room_type_Entire home/apt','room_type_Private room','room_type_Shared room','neighbourhood_cleansed_Avalon',
                       'Bel-Air','Beverly Crest','Beverly Hills','Century City','Encino','Hasley Canyon','Hollywood Hills West','Malibu',
                       'Pacific Palisades','Unincorporated Catalina Island','Unincorporated Santa Monica Mountains','Unincorporated Santa Susana Mountains',
                       'Vernon','Westlake Village','bathrooms','bedrooms','beds','cleaning_fee','minimum_nights','amenities'],
           'coeffits':[77.4344,88.3273, 5.6587,139.7511, 573.3431 ,285.5859, 82.3291,286.9593, 84.4890,84.4743, 226.2936, 609.6742,
                       100.0329,404.3530,245.2676, 87.1175, 84.8923, 120.2296,139.7979,21.7480, -9.4181, 1.2046, -0.5185, -0.6230]}
factors_df = pd.DataFrame(data=factors)
factors_df.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/factors.csv')
# Fix: os.getcwd() takes no arguments -- the original passed a path (TypeError).
print(os.getcwd())
# Parse the OLS summary's coefficient table into a sortable DataFrame.
coef_df= pd.DataFrame(ols_result.summary().tables[1].data)
coef_df.columns = coef_df.iloc[0]
coef_df = coef_df.drop(0)
coef_df = coef_df.set_index(coef_df.columns[0])
coef_df = coef_df.astype(float)
# Half-width of the 95% CI: coefficient minus its lower bound.
errors = coef_df['coef'] - coef_df['[0.025']
coef_df['errors'] = errors
coef_df = coef_df.sort_values(by=['coef'])
variables = list(coef_df.index.values)
coef_df['variables'] = variables
coef_df
# Split coefficients into neighbourhood vs non-neighbourhood features and plot
# each group as a bar chart with 95% CI error bars.
without_neighbours = ['room_type_Entire home/apt','room_type_Private room','room_type_Shared room',
                      'room_type_Hotel room', 'bathrooms','bedrooms','beds','cleaning_fee','minimum_nights',
                      'amenities','property_type_Apartment','property_type_Condominium','property_type_Guesthouse',
                      'property_type_House']
with_neighbours_coeffcis = coef_df.drop(without_neighbours)
with_neighbours_coeffcis
fig, ax = plt.subplots(figsize=(15, 10))
with_neighbours_coeffcis.plot(x='variables', y='coef', kind='bar',
                              ax=ax, color='none', fontsize=22,
                              ecolor='steelblue',capsize=0,
                              yerr='errors', legend=False)
plt.title('Coefficients of Features w/ 95% Confidence Intervals',fontsize=30)
ax.set_ylabel('Coefficients',fontsize=22)
ax.set_xlabel('',fontsize=22)
# Fixes: pd.np is deprecated (use np), and the scatter must use the *plotted
# subset* -- the original drew len(coef_df) markers over len(subset) bars,
# misaligning every point.
ax.scatter(x=np.arange(with_neighbours_coeffcis.shape[0]),
           marker='o', s=80,
           y=with_neighbours_coeffcis['coef'], color='steelblue')
# Line to define zero on the y-axis
ax.axhline(y=0, linestyle='--', color='red', linewidth=1)
without_neighbours_df = coef_df.loc[without_neighbours]
fig, ax = plt.subplots(figsize=(15, 10))
without_neighbours_df.plot(x='variables', y='coef', kind='bar',
                           ax=ax, color='none', fontsize=22,
                           ecolor='steelblue',capsize=0,
                           yerr='errors', legend=False)
plt.title('Coefficients of Features w/ 95% Confidence Intervals',fontsize=30)
ax.set_ylabel('Coefficients',fontsize=22)
ax.set_xlabel('',fontsize=22)
# Same two fixes as above for the non-neighbourhood subset.
ax.scatter(x=np.arange(without_neighbours_df.shape[0]),
           marker='o', s=80,
           y=without_neighbours_df['coef'], color='steelblue')
# Line to define zero on the y-axis
ax.axhline(y=0, linestyle='--', color='red', linewidth=1)
coef_df
def coefplot(results):
    """Bar-plot the coefficients of a fitted statsmodels OLS result with
    95% confidence-interval error bars, sorted by coefficient value.

    Fix: the original ignored its `results` parameter and read the global
    `ols_result`; it now plots whatever result object is passed in.
    """
    coef_df= pd.DataFrame(results.summary().tables[1].data)
    coef_df.columns = coef_df.iloc[0]
    coef_df = coef_df.drop(0)
    coef_df = coef_df.set_index(coef_df.columns[0])
    coef_df = coef_df.astype(float)
    # Half-width of the 95% CI: coefficient minus its lower bound.
    errors = coef_df['coef'] - coef_df['[0.025']
    coef_df['errors'] = errors
    coef_df = coef_df.sort_values(by=['coef'])
    variables = list(coef_df.index.values)
    coef_df['variables'] = variables
    sns.set_context("poster")
    fig, ax = plt.subplots(figsize=(15, 10))
    coef_df.plot(x='variables', y='coef', kind='bar',
                 ax=ax, color='none', fontsize=22,
                 ecolor='steelblue',capsize=0,
                 yerr='errors', legend=False)
    plt.title('Coefficients of Features w/ 95% Confidence Intervals',fontsize=30)
    ax.set_ylabel('Coefficients',fontsize=22)
    ax.set_xlabel('',fontsize=22)
    # np.arange replaces the deprecated pd.np.arange.
    ax.scatter(x=np.arange(coef_df.shape[0]),
               marker='o', s=80,
               y=coef_df['coef'], color='steelblue')
    # Line to define zero on the y-axis
    ax.axhline(y=0, linestyle='--', color='red', linewidth=1)
    return plt.show()
coefplot(ols_result)
pd.DataFrame(ols_result.summary().tables[1].data)
factors_df.to_csv('/Users/sitebai/Desktop/UCI/fall/bana212da/listing_zips/factors_df.csv')
# Fix: `factors` is a plain dict (no .to_frame); build the frame explicitly.
pd.DataFrame(factors)
# (removed stray line `neighbourhood_cleansed_Avalon : 135.6014` -- it parsed
# as a no-op variable annotation and did nothing)
# Fix: Summary objects are not subscriptable; .tables[0] is the header table.
ols_result.summary().tables[0]
# Train/test evaluation of plain linear regression on the unscaled features.
from sklearn import metrics
y_train= airbnb_with_dummies['price']
# Fix: `no_sale_x_train` was a typo (NameError) for noscale_x_train.
noscale_x_train
X_train, X_test, Y_train, Y_test = train_test_split(noscale_x_train, y_train, test_size = 0.2, random_state = 462)
airbnb_linreg = LinearRegression()
airbnb_linreg.fit(X_train, Y_train)
airbnb_linreg.score(X_train, Y_train)
airbnb_linreg.score(X_test,Y_test )
airbnb_y_predict = airbnb_linreg.predict(X_test)
print(metrics.mean_squared_error(Y_test,airbnb_y_predict))
# RMSE from the MSE printed above (hand-copied value).
np.sqrt(71950)
#please take the sqrt of mean
import math
math.sqrt(71950)
# (removed `sirbnb_y_predict.score()` and bare `airbnb_dum` -- both were typos
# for names that never existed; prediction arrays have no .score method anyway)
noscale_x_train.head()
y_train.shape
# 10-fold cross-validated RMSE for plain linear regression.
airbnb_linear = LinearRegression()
cv_linear = cross_val_score(airbnb_linear, noscale_x_train, y_train, scoring ='neg_mean_squared_error', cv = 10 )
# Fix: `mean`/`absolute` are only imported later in the file; use numpy directly.
np.mean(np.sqrt(np.abs(cv_linear)))
noscale_x_train.head()
square_root_mse = np.sqrt(-cv_linear)
square_root_mse.mean()
airbnb_linear_regression = LinearRegression()
airbnb_linear_mse = cross_val_score(airbnb_linear_regression , noscale_x_train, y_train, scoring ='neg_mean_squared_error', cv =10)
airbnb_linear_mse
np.mean(np.abs(airbnb_linear_mse))
np.mean(np.sqrt(np.abs(airbnb_linear_mse )))
np.sqrt(-airbnb_linear_mse).mean()
# Fix: x and y must exist before train_test_split uses them -- the original
# performed the split one line *above* their definitions (NameError on a
# fresh top-to-bottom run).
y = airbnb_with_dummies['price']
x = airbnb_with_dummies.drop('price',axis = 1)
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 462)
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, x, y,scoring = "neg_mean_squared_error", cv=10)
#no need to sqrt of mse(double check)
lin_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    """Print the cross-validation scores together with their mean and spread."""
    for label, value in (("Scores", scores), ("Mean", scores.mean())):
        print(label, value)
    print("Standard Deviation:", scores.std())
# Report the cross-validated RMSE scores with their mean and spread.
display_scores(lin_rmse_scores)
from numpy import mean
from numpy import std
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from numpy import arange
airbnb_lasso1 = Lasso(alpha=1.0)
model = Lasso(alpha=1.0)
# NOTE(review): alpha=0 makes Lasso equivalent to unpenalized OLS and sklearn
# advises LinearRegression instead -- confirm this model is intentional.
model_2 = Lasso(alpha=0.0)
# Fix: `scores` must be computed before it is summarized -- the original read
# it two lines before this cross_val_score call (NameError on a fresh run).
scores = cross_val_score(model, x_train, y_train, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
model_2_results = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(model_2_results), std(model_2_results)))
positive_scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(positive_scores), std(positive_scores)))
# Search the lasso penalty over [0, 1) in 0.01 steps with 10-fold CV.
lasso_cv = LassoCV(alphas=arange(0, 1, 0.01), cv=10, n_jobs=-1)
lasso_cv.fit(x_train, y_train)
print('alpha: %f' % lasso_cv.alpha_)
after_scaling = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(after_scaling), std(after_scaling)))
x_train.head()
# Fixes: the scaler must exist before use -- the original called
# scaler.fit_transform two cells *before* `scaler = StandardScaler()` -- and
# the bare `scaler.transform()` with no data (TypeError) is removed.
# StandardScaler is imported here because the module-level import only
# appears later in the notebook flow.
from sklearn.preprocessing import StandardScaler
#Standardize dataset first
scaler = StandardScaler()
x_train[['bathrooms','bedrooms','beds','cleaning_fee','minimum_nights']] = scaler.fit_transform(x_train[['bathrooms','bedrooms','beds','cleaning_fee','minimum_nights']])
x_train.head()
# NOTE(review): LassoCV's `normalize` parameter was removed in scikit-learn
# 1.2; on modern versions this call needs a StandardScaler pipeline instead.
airbnb_lasso1 = LassoCV(normalize = True, cv = 10).fit(x,y)
importance = np.abs(airbnb_lasso1.coef_)
#importance
from sklearn import linear_model
airbnb_lasso3 = linear_model.Lasso(alpha = 0.1)
airbnb_lasso2 = Lasso()
# NOTE(review): dlx_train/dly_train are only created further down (the deep-
# learning split); on a top-to-bottom run these names do not exist yet.
airbnb_lasso2.fit(dlx_train, dly_train)
# Fix: `lasso` was an undefined name; score the model that was just fitted.
train_score = airbnb_lasso2.score(dlx_test, dly_test)
#airbnb_lasso2 = Lasso(alpha = 0.1)
#cv = cross_val_score(airbnb_lasso2, x,y ,scoring='neg_mean_absolute_error', cv =10)
#cv = absolute(scores)
X_train, X_test, Y_train, Y_test = train_test_split(noscale_x_train, y_train, test_size = 0.2, random_state = 462)
aribnb_lasso2 = Lasso(alpha = 0.01)
aribnb_lasso2.fit(X_train, Y_train)
train_score = aribnb_lasso2.score(X_train, Y_train)
test_score = aribnb_lasso2.score(X_test, Y_test)
# Count the features the lasso kept (non-zero coefficients).
importance = np.abs(aribnb_lasso2.coef_)
len(importance[importance != 0])
# 281  <- observed count; the bare literal in the original was a no-op echo
# loop alpha 0.001,0.02, 0.5
#it'a suppoed to be lower than linear regression
print(train_score)
print(test_score)
airbnb_yhat = aribnb_lasso2.predict(X_test)
from sklearn.metrics import mean_squared_error
mean_squared_error(Y_test, airbnb_yhat)
# RMSE from the MSE above (hand-copied value).
np.sqrt(71963)
X_train, X_test, Y_train, Y_test = train_test_split(noscale_x_train, y_train, test_size = 0.2, random_state = 462)
def _lasso_alpha_sweep(alphas):
    """Fit one Lasso per alpha on the X_train/X_test split.

    Returns ({alpha: (train_R2, test_R2)}, [test MSE per alpha]).

    Collapses the three near-identical loops of the original into one helper,
    and fixes the second loop, which keyed the score dict by the loop counter
    (1..50) instead of the alpha value it actually used.
    """
    scores_by_alpha = {}
    mses = []
    for alpha in alphas:
        lasso_model = Lasso(alpha = alpha)
        lasso_model.fit(X_train, Y_train)
        scores_by_alpha[alpha] = (lasso_model.score(X_train, Y_train),
                                  lasso_model.score(X_test, Y_test))
        mses.append(mean_squared_error(Y_test, lasso_model.predict(X_test)))
    return scores_by_alpha, mses
# Three sweeps: a coarse triple, a fine 0.001..0.05 grid, and a wide triple.
lasso_scores, lasso_mses = _lasso_alpha_sweep([0.01, 0.001, 0.002])
lasso_scores, lasso_mses = _lasso_alpha_sweep([0.001 * i for i in range(1, 51)])
lasso_scores
lasso_mses
# RMSE from the best MSE above (hand-copied value).
np.sqrt(71988)
lasso_scores, lasso_mses = _lasso_alpha_sweep([0.001, 0.02, 1])
lasso_scores
lasso_mses
#!pip install tensorflow
import tensorflow as tf
#!pip install Keras
import keras
from keras.layers import Dense
from keras.models import Sequential
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
# Fix: `tranin_test_split` was a typo (ImportError); train_test_split is the
# intended import (also imported at the top of the file).
from sklearn.model_selection import train_test_split
dlx_train, dlx_test, dly_train, dly_test = train_test_split(x,y, test_size = 0.2, random_state = 462)
from sklearn.preprocessing import StandardScaler
# Fix: StratifiedKFold requires discrete class labels and raises on a
# continuous target like price; KFold gives the intended 10-fold split.
from sklearn.model_selection import KFold
k_fold = KFold(n_splits=10, shuffle=True)
len(x.columns)
for train, test in k_fold.split(x, y):
    print (len(train))
x.head()
y.head()
#Standardize dataset first
scaler = StandardScaler()
# Fix: np.float was deprecated and removed (NumPy 1.24+); builtin float is
# the documented replacement.
dlx_train = scaler.fit_transform(dlx_train.astype(float))
dlx_test = scaler.transform(dlx_test.astype(float))
#besides dense, what else could be used here?
#change the input number
#dropout?
x.shape
n_cols = x_train.shape[1]
n_cols
# Four hidden ReLU layers of 200 units; linear output for price regression.
airbnb_deep_model = Sequential()
airbnb_deep_model.add(Dense(200, activation = 'relu', input_shape = (n_cols,)))
airbnb_deep_model.add(Dense(200, activation = 'relu'))
airbnb_deep_model.add(Dense(200, activation = 'relu'))
airbnb_deep_model.add(Dense(200, activation = 'relu'))
airbnb_deep_model.add(Dense(1))
airbnb_deep_model.compile(optimizer = 'Adam', loss ='mean_squared_error')
#batch_size double check
# NOTE(review): this fits on the raw x/y, not the scaled dlx_train/dly_train
# prepared above -- confirm whether the scaling was meant to feed the network.
airbnb_fit= airbnb_deep_model.fit(x,y,validation_split = 0.2)
#airbnb_deep_model.summary()
print(np.sqrt(63986))
print(np.sqrt(72638))
# predict ()
mse_score = airbnb_deep_model.evaluate(x,y)
mse_score
airbnb_la.price.describe()
# Fix: the IQR must be computed before it is used -- the original referenced
# `iqr` two lines before defining it (NameError on a fresh run).
q75, q25 = np.percentile(airbnb_la['price'], [75 ,25])
iqr = q75 - q25
# Standard Tukey fences: above q75 + 1.5*IQR, below q25 - 1.5*IQR. The
# original hard-coded the quartiles (189 and 75) and got the lower fence's
# sign wrong (`price < 1.5*iqr - 75` instead of `price < 75 - 1.5*iqr`).
airbnb_la[airbnb_la['price'] > q75 + 1.5 * iqr].shape
airbnb_la[airbnb_la['price'] < q25 - 1.5 * iqr].shape
airbnb_la.shape
# Boxplot demo on seaborn's sample data, then on listing prices by room type.
import seaborn as sns
sns.set_theme(style="whitegrid")
tips = sns.load_dataset("tips")
ax = sns.boxplot(x=tips["total_bill"])
import seaborn as sns
sns.set_theme(style="whitegrid")
airbnb_la.head()
box_plot = sns.boxplot(x = airbnb_la.room_type, y = airbnb_la.price)
airbnb_la.head()
# what property has the highest average price?
airbnb_nomissing[['property_type','price']].groupby('property_type').agg(lambda x : x.mean())
airbnb_nomissing[['room_type','price']].groupby('room_type').agg(lambda x : x.mean())
import matplotlib.pyplot as plt
import seaborn as sns
# Fix: `len(x == 't')` just counts every row (the comparison does not shrink
# the Series), and host_is_superhost was mapped to 0/1 earlier in the file,
# so the superhost count per group is the number of 1s.
property_superhost = airbnb_nomissing[['property_type','host_is_superhost']].groupby('property_type').agg(lambda x : (x == 1).sum() )
property_superhost
airbnb_nomissing[['property_type','price']].groupby('property_type').agg(lambda x : x.mean())
##line graph
##
room_superhost = airbnb_nomissing[['room_type','host_is_superhost']].groupby('room_type').agg(lambda x : (x == 1).sum() )
property_room_superhost =airbnb_nomissing.groupby(['property_type','room_type'])['host_is_superhost'].agg(lambda x : (x == 1).sum()).to_frame()
airbnb_with_dummies.select_dtypes(include = 'object').columns
# NOTE(review): airbnb_training is only created in a commented-out cell above;
# these lines NameError on a fresh run -- confirm which frame was intended.
airbnb_training.head()
neighbour_property = airbnb_training.groupby('neighbourhood_cleansed')['property_type'].count().sort_values(ascending = False).to_frame()
# Fix: bare `mean` was undefined; pandas accepts the aggregation name 'mean'.
neighbour_property = airbnb_la.groupby('neighbourhood_cleansed').agg({'price':'mean'})
neighbour_property = neighbour_property.reset_index()
sorted_average_price = neighbour_property.sort_values(by = 'price', ascending = False)
sorted_average_price
neighbour_property = neighbour_property.reset_index()
neighbour_listings = neighbour_property.rename(columns = {'property_type':'num_listings'})
airbnb_training.head()
neighbour_property.head()
import folium
# LA neighborhood boundaries (UCLA-hosted GeoJSON).
gjson2 = r'https://apps.gis.ucla.edu/geodata/dataset/93d71e41-6196-4ecb-9ddd-15f1a4a7630c/resource/6cde4e9e-307c-477d-9089-cae9484c8bc1/download/la-county-neighborhoods-v6.geojson'
area_map = folium.Map(location =[34.0522, -118.2437], zoom_start = 12 )
# Fix: Map.choropleth was deprecated and removed in modern folium; the
# supported API is folium.Choropleth(...).add_to(map) with the same arguments.
folium.Choropleth(
    geo_data=gjson2,
    data=sorted_average_price,
    columns=['neighbourhood_cleansed','price'],
    key_on='feature.properties.name',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Average Listing price in Los Angeles'
).add_to(area_map)
area_map
# Sanity-check that listing ids are unique (unique count vs row count).
airbnb_nomissing['id'].value_counts().shape
len(airbnb_nomissing['id'])
airbnb_nomissing.zipcode.dtypes
# Average price per zipcode, highest first.
zipcode_priceavg = airbnb_nomissing[['zipcode','price']].groupby('zipcode').agg(lambda x: x.mean())
#missing_stats.sort_values(by = "missing_num", ascending =False, inplace = True)
zipcode_priceavg.sort_values(by = 'price',ascending = False, inplace = True)
zipcode_priceavg.head()
# Fix: `in` on a pandas Series tests the *index*, not the values, and the
# stored placeholder is '-- default zip code --' (see the filter below), so
# the original check could never fire; search the values for the substring.
if airbnb_nomissing['zipcode'].astype(str).str.contains("default zip code").any():
    print("yes")
airbnb_nomissing[airbnb_nomissing['zipcode'] == '-- default zip code --'][['zipcode','price',
                                                                           'property_type','room_type']]
airbnb_nomissing.head()